0.1 Load Libraries

library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)
library(ggplot2)

0.2 Load and Merge Data

SC33 is the control while SC14 and SC15 are TiO2 and Asbestos respectively. Below, we modify cell names (barcodes) so they are all unique between samples. Here, we create a merged Seurat Object to identify clusters without using CCA.

SC33.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC33_multirun/filtered_gene_bc_matrices/mm10/")
SC33 <- CreateSeuratObject(raw.data = SC33.data, min.cells = 3, min.genes = 200, project = "SC33")
SC33@cell.names <- paste("SC33", SC33@cell.names, sep = "_")
colnames(x = SC33@raw.data) <- paste("SC33", colnames(x = SC33@raw.data), sep = "_")
rownames(x = SC33@meta.data) <- paste("SC33", rownames(x = SC33@meta.data), sep = "_")

SC14.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC14/filtered_gene_bc_matrices/mm10/")
lung <- AddSamples(object = SC33, new.data = SC14.data, add.cell.id = "SC14")
SC15.data <- Read10X(data.dir = "~/Box/Asbestos_SC/03_Analysis_V2/00_Asbestos/01_data/SC15/filtered_gene_bc_matrices/mm10/")
lung <- AddSamples(object = lung, new.data = SC15.data, add.cell.id = "SC15")

0.3 Data Cleaning and Inspection

In the beginning, we skip filtering and keep all cells, irrespecive of %mito and number of genes.

0.3.1 Normalize and find variable genes

lung <- NormalizeData(object = lung, normalization.method = "LogNormalize", scale.factor = 10000) #Log normalize
lung <- FindVariableGenes(object = lung, mean.function = ExpMean, dispersion.function = LogVMR) #Find variable genes

length(x = lung@var.genes) #958
## [1] 958
hv.genes <- head(rownames(lung@hvg.info), 958)

0.3.2 Data inspection and Scaling

mito.genes <- grep(pattern = "^mt-", x = rownames(x = lung@data), value = TRUE)
percent.mito <- Matrix::colSums(lung@raw.data[mito.genes, ])/Matrix::colSums(lung@raw.data)
lung <- AddMetaData(object = lung, metadata = percent.mito, col.name = "percent.mito")

lung <- ScaleData(object = lung, display.progress = T,vars.to.regress = c("nUMI"))

0.4 Perform PCA and Clustering

We examined our scree plot and manually inspected principal compenents for meaningful genes and clusters based on size. This led to the selection of the first 75 principal components as this is where we noted a decline in the quality of clusters being formed.

0.4.1 Heatmap and Scree Plot

## [1] "PC1"
## [1] "Ramp2"   "Cldn5"   "Cdh5"    "Egfl7"   "Tmem100"
## [1] ""
## [1] "Rpl17"   "Rps18"   "Cd74"    "H2-Aa"   "Ptprcap"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "Ager"   "Clic3"  "Cldn18" "Aqp5"   "Krt7"  
## [1] ""
## [1] "Gngt2"  "Wfdc17" "Rps18"  "Car4"   "Msrb1" 
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "Cd79a" "Cd79b" "Ly6d"  "Ebf1"  "Cd74" 
## [1] ""
## [1] "Msrb1"  "Lgals3" "Wfdc17" "Wfdc21" "Slpi"  
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "Igfbp5"   "Aldh1a1"  "Bgn"      "Serping1" "Mgp"     
## [1] ""
## [1] "Chil1"   "Lamp3"   "Slc34a2" "Sftpd"   "Hc"     
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "Bgn"      "Mgp"      "Ogn"      "Serping1" "Col3a1"  
## [1] ""
## [1] "Ccdc153"       "1110017D15Rik" "1700016K19Rik" "1700007K13Rik"
## [5] "Riiad1"       
## [1] ""
## [1] ""

0.4.2 Clustering and Visualization of Overall TSNE Plot

lung <- FindClusters(object = lung, reduction.type = "pca", dims.use = 1:75, resolution = 3, print.output = F, save.SNN = TRUE, force.recalc = T, n.start = 100)
## [1] "3 singletons identified. 49 final clusters."
lung <- RunTSNE(object = lung, dims.use = 1:75, do.fast = TRUE, check_duplicates = FALSE, perplexity = 30)
## Warning: package 'bindrcpp' was built under R version 3.4.4

##Validation of Clusters and Obtaining Marker Genes We finally used the top 50 genes in specific clusters to validate the identities of the clusters from our TSNE that stood out as they seemed to be split in unusual ways. Following this we found markers for each cluster and assigned identities to each cluster by manual expression of top marker genes.

TSNEPlot(object = lung, do.label = T)

lung <-ValidateSpecificClusters(lung, cluster1 = 0, cluster2 = 2, top.genes = 50) #B cells
## Warning: package 'caret' was built under R version 3.4.4
## [1] "Comparing cluster 0 and 2: Acc = 0.607916647434279"
## [1] "merge cluster 0 and 2"
TSNEPlot(object = lung, do.label = T)

lung <-ValidateSpecificClusters(lung, cluster1 = 1, cluster2 = 7, top.genes = 50) #AT2
## [1] "Comparing cluster 1 and 7: Acc = 0.998786513786514"
TSNEPlot(object = lung, do.label = T)

lung <-ValidateSpecificClusters(lung, cluster1 = 19, cluster2 = 7, top.genes = 50) #AT2
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.

## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## [1] "Comparing cluster 19 and 7: Acc = 0.997356254856255"
TSNEPlot(object = lung, do.label = T)

lung <-ValidateSpecificClusters(lung, cluster1 = 14, cluster2 = 7, top.genes = 50) #AT2
## [1] "Comparing cluster 14 and 7: Acc = 0.706970654534003"
## [1] "merge cluster 14 and 7"
TSNEPlot(object = lung, do.label = T)

lung <-ValidateSpecificClusters(lung, cluster1 = 12, cluster2 = 4, top.genes = 50) #AM
## [1] "Comparing cluster 12 and 4: Acc = 0.897936351068813"
## [1] "merge cluster 12 and 4"
TSNEPlot(object = lung, do.label = T) 

lung.markers <- FindAllMarkers(object = lung, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.5, max.cells.per.ident = 200)
#save(lung, file = "lung.clean.Robj")

0.4.3 Initial list of clusters based on examination of markers

C2 - B cells C3 - Classcial monocytes C4 - Alveolar macrophages C5 - Classcial monocytes C6 - T cells C7 - AT2 cells C8 - NK cells C9 - Endothelial cells (Cldn5, Kdr, Kitl, Igfbp7) C10 - Non-classical monocytes C11 - Neutrophils C13 - Endothelial cells (Cd93, Ptprb, Aqp1) C15 - T cells C16 - Peribronchial interstitial macrophages C17 - DC2 C18 - AT1 cells C20 - DC1 C21 - T cells C22 - T cells C23 - Ciliated cells C24 - Fibroblasts C25 - Neutrophils C26 - Club cells C27 - Ciliated cells C28 - pDC C29 - Doublets: Neutrophils and B cells C30 - Endothelial cells (C13) C31 - Endothelial cells (C13) C32 - Cycling AMs C33 - Mesothelial cells C34 - AT2 cells C35 - Smooth muscle cells C36 - AT2 cells C37 - Ccr7+ DC C38 - Doublets: AT2 and Neutrophils C39 - Doublets: B cells and monocytes C40 - Cell cycle/Damaged DC1 C41 - Cell cycle/Damaged T cells C42 - Mast cells/basophils C43 - Lymphatic progenitors C44 - Doublets: AM/AT2 C45 - Megakaryocytes C46 - Doublets: AM and Neutrophils C47 - Doublets: AM and NK cells

Composition of each cluster by library id:

0.5 Celltype Analysis (AT2 Cells)

After forming the initial Seurat Object consisting of these clusters and all three libraries we then proceeded to identify sub populations within clusters in an iterative process similar to what was previously done in the merged object. For the purposes of this markdown we show the process done for AT2 cells while other celltypes can be found in the full Rcode included (02_Asbestos_Celltype_Exploration.R).

0.5.1 Select Cluster of Interest thought to be AT2 based on markers:

AT2 <- SubsetData(lung, ident.use = c(7, 34, 36))
TSNEPlot(AT2, do.label = T)

TSNEPlot(AT2, do.label = T, group.by = "orig.ident")

AT2 <- FindVariableGenes(object = AT2, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

AT2 <- ScaleData(object = AT2, vars.to.regress = c("nUMI", "percent.mito"))
AT2 <- RunPCA(object = AT2, pc.genes = AT2@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5, pcs.compute = 50)

0.5.2 Examining Principle Components:

###Find Clusters and run TSNE:

AT2 <- FindClusters(object = AT2, reduction.type = "pca", dims.use = 1:15, resolution = 0.3, print.output = 0, save.SNN = TRUE, force.recalc = T)
PrintFindClustersParams(object = AT2)
## Parameters used in latest FindClusters calculation run on: 2018-06-06 16:19:36
## =============================================================================
## Resolution: 3
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 100             10
## -----------------------------------------------------------------------------
## Reduction used          k.param          k.scale          prune.SNN
##      pca                 30                25              0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
AT2 <- RunTSNE(object = AT2, dims.use = 1:12, do.fast = TRUE, check_duplicates = FALSE)

###Resolve ambiguities by checking for known contaminating cell type markers (NK, B, Tcell markers). Here we found sub clusters 3, 4 and 5 contain B, monocytes and T/NK cell doublets, so we removed them before confirming the other sub populations we found within AT2s via reclustering.

FeaturePlot(AT2, features.plot = c("Ccr2", "Cd3e","Nkg7", "Cd79a"))

AT2.markers <- FindAllMarkers(object = AT2, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.5, max.cells.per.ident = 200)

###Recheck variable genes and rescale and run PCA:

AT2 <- FindVariableGenes(object = AT2, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

length(x = AT2@var.genes)
AT2 <- ScaleData(object = AT2, vars.to.regress = c("nUMI"))
AT2 <- RunPCA(object = AT2, pc.genes = AT2@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5, pcs.compute = 20)

0.5.3 examine PCs:

###Recluster and run TSNE:

AT2 <- FindClusters(object = AT2, reduction.type = "pca", dims.use = 1:8, resolution = 0.4, print.output = 0, save.SNN = TRUE, force.recalc = T)
PrintFindClustersParams(object = AT2)
## Parameters used in latest FindClusters calculation run on: 2018-06-06 16:19:36
## =============================================================================
## Resolution: 3
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 100             10
## -----------------------------------------------------------------------------
## Reduction used          k.param          k.scale          prune.SNN
##      pca                 30                25              0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
AT2 <- RunTSNE(object = AT2, dims.use = 1:8, do.fast = TRUE, check_duplicates = FALSE, perplexity = 20)

0.5.4 Plot TSNE:

###Find marker genes and view heatmap of top 10 genes in each subcluster:

AT2.markers <- FindAllMarkers(object = AT2, only.pos = TRUE, min.pct = 0.5, thresh.use = 0.25, test.use = "wilcox")
write.csv(AT2.markers, "AT2.markers_PC1_8_res0.4wilcox.csv")

###Vizualize the library origin of cells across the subclusters